AAQoL machine learning analysis with unbalanced random forest

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)
library(ggRandomForests)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above")))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Demographics

ps(Ethnicity)
# A tibble: 7 × 3
  Ethnicity        n     pct
  <fct>        <int>   <dbl>
1 Chinese        639 24.5   
2 Asian Indian   574 22.0   
3 Filipino       265 10.2   
4 Korean         471 18.1   
5 Other          144  5.52  
6 Vietnamese     514 19.7   
7 <NA>             2  0.0767
ps(Gender)
# A tibble: 3 × 3
  Gender     n   pct
  <fct>  <int> <dbl>
1 Female  1425 54.6 
2 Male    1157 44.3 
3 <NA>      27  1.03
ps(Religion)
# A tibble: 8 × 3
  Religion       n    pct
  <fct>      <int>  <dbl>
1 Buddhist     350 13.4  
2 Catholic     492 18.9  
3 Hindu        479 18.4  
4 Muslim        68  2.61 
5 None         506 19.4  
6 Other         47  1.80 
7 Protestant   645 24.7  
8 <NA>          22  0.843
ps(`Full Time Employment`)
# A tibble: 3 × 3
  `Full Time Employment`     n    pct
  <fct>                  <int>  <dbl>
1 0                       1458 55.9  
2 Employed full time      1144 43.8  
3 <NA>                       7  0.268
ps(Income)
# A tibble: 9 × 3
  Income                n   pct
  <fct>             <int> <dbl>
1 $0 - $9,999         254  9.74
2 $10,000 - $19,999   205  7.86
3 $20,000 - $29,999   198  7.59
4 $30,000 - $39,999   207  7.93
5 $40,000 - $49,999   181  6.94
6 $50,000 - $59,999   178  6.82
7 $60,000 - $69,999   190  7.28
8 $70,000 and over    993 38.1 
9 <NA>                203  7.78
ps(`English Speaking`)
# A tibble: 5 × 3
  `English Speaking`     n    pct
  <fct>              <int>  <dbl>
1 Not at all           177  6.78 
2 Not well             632 24.2  
3 Very well            974 37.3  
4 Well                 808 31.0  
5 <NA>                  18  0.690
ps(`English Difficulties`)
# A tibble: 5 × 3
  `English Difficulties`     n   pct
  <fct>                  <int> <dbl>
1 Not at all               772 29.6 
2 Much                     549 21.0 
3 Not much                 733 28.1 
4 Very much                516 19.8 
5 <NA>                      39  1.49
ps(Discrimination)
# A tibble: 3 × 3
  Discrimination     n   pct
           <dbl> <int> <dbl>
1              0  1598  61.2
2              1   694  26.6
3             NA   317  12.2
qol |> summarize(age_mean = mean(Age,na.rm=T),
                 age_sd = sd(Age,na.rm=T),
                 age_min = min(Age,na.rm=T),
                 age_max = max(Age,na.rm=T))
# A tibble: 1 × 4
  age_mean age_sd age_min age_max
     <dbl>  <dbl>   <dbl>   <dbl>
1     42.9   17.1      18      98

Source of Information: Family

ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 
rfdata <- qol |> filter(Family %in% c("No","Yes")) |> 
  mutate(Family=droplevels(Family)) |> 
  select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  # filter(!is.na(Family)) |> 
  na.omit() |>
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc",method="brf")
print(rfobj)
                         Sample size: 1926
           Frequency of class labels: 928, 998
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 528.6857
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swr
    Resample size used to grow trees: 1856
                            Analysis: RF-C
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0754
                   (OOB) Brier score: 0.23092348
        (OOB) Normalized Brier score: 0.92369391
                           (OOB) AUC: 0.65174638
                        (OOB) PR-AUC: 0.61532647
                        (OOB) G-mean: 0.60732292
   (OOB) Requested performance error: 0.39267708

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  610 318      0.3427
       Yes 438 560      0.4389

      (OOB) Misclassification rate: 0.3925234
print(rfobj)
                         Sample size: 1926
           Frequency of class labels: 928, 998
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 528.6857
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swr
    Resample size used to grow trees: 1856
                            Analysis: RF-C
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0754
                   (OOB) Brier score: 0.23092348
        (OOB) Normalized Brier score: 0.92369391
                           (OOB) AUC: 0.65174638
                        (OOB) PR-AUC: 0.61532647
                        (OOB) G-mean: 0.60732292
   (OOB) Requested performance error: 0.39267708

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  610 318      0.3427
       Yes 438 560      0.4389

      (OOB) Misclassification rate: 0.3925234
plot(rfobj,plots.one.page = FALSE)


                             all   No   Yes
Age                       0.0314   NA    NA
EnglishDiff               0.0159   NA    NA
Ethnicity                 0.0156   NA    NA
Trust                     0.0092   NA    NA
Employment                0.0089   NA    NA
Helpful.Family            0.0062   NA    NA
Discrimination            0.0059   NA    NA
Income_median             0.0059   NA    NA
EnglishSpeak              0.0055   NA    NA
Loyalty                   0.0051   NA    NA
Religion                  0.0049   NA    NA
See.Family                0.0048   NA    NA
Dental.Insurance          0.0047   NA    NA
Close.Family              0.0046   NA    NA
Togetherness              0.0044   NA    NA
Feel.Close                0.0040   NA    NA
Successful.Family         0.0036   NA    NA
Community.Shares.Values   0.0035   NA    NA
Religious.Attendance      0.0035   NA    NA
Expression                0.0035   NA    NA
Community.Trust           0.0034   NA    NA
Helpful.Community         0.0030   NA    NA
Similar.Values            0.0029   NA    NA
Spend.Time.Together       0.0012   NA    NA
Religious.Importance      0.0007   NA    NA
Close.Friends             0.0000   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity                1.561676e-02 NA  NA
Age                      3.136262e-02 NA  NA
Gender                  -1.289370e-03 NA  NA
Religion                 4.869572e-03 NA  NA
Employment               8.869707e-03 NA  NA
Income_median            5.881249e-03 NA  NA
EnglishSpeak             5.466686e-03 NA  NA
EnglishDiff              1.589785e-02 NA  NA
See.Family               4.815807e-03 NA  NA
Close.Family             4.648270e-03 NA  NA
Helpful.Family           6.200891e-03 NA  NA
See.Friends             -2.383014e-03 NA  NA
Close.Friends           -3.022316e-05 NA  NA
Helpful.Friends         -4.584730e-03 NA  NA
Family.Respect          -6.645626e-04 NA  NA
Similar.Values           2.851297e-03 NA  NA
Successful.Family        3.594840e-03 NA  NA
Trust                    9.233369e-03 NA  NA
Loyalty                  5.073923e-03 NA  NA
Family.Pride            -1.741601e-03 NA  NA
Expression               3.494694e-03 NA  NA
Spend.Time.Together      1.190567e-03 NA  NA
Feel.Close               3.958902e-03 NA  NA
Togetherness             4.427049e-03 NA  NA
Religious.Attendance     3.534035e-03 NA  NA
Religious.Importance     7.409379e-04 NA  NA
Close.knit.Community    -1.842641e-03 NA  NA
Helpful.Community        2.955803e-03 NA  NA
Community.Shares.Values  3.534035e-03 NA  NA
Get.Along               -1.307111e-03 NA  NA
Community.Trust          3.355233e-03 NA  NA
Health.Insurance        -8.533212e-05 NA  NA
Dental.Insurance         4.749503e-03 NA  NA
Discrimination           5.888430e-03 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100) |> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Cross validation in Random Forests (Run when you have time)

# myTrainingControl <- trainControl(method = "repeatedcv",
#                                   number = 10,                   
#                                   repeats = 3,                 
#                                   savePredictions = TRUE,
#                                   classProbs = TRUE,
#                                   verboseIter = TRUE,
#                                   search = "grid")
# 
# 
# set.seed(123)
# 
# model_rf <- train(Family~ .,
#                   data=rfdata,
#                   method = 'rf',
#                   metric = "Accuracy",             
#                   trControl = myTrainingControl,       
#                   importance = TRUE                 
#                   )
# 
# varImp(model_rf)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Family~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]
# 
# rfsrc(Family~.,data=train, importance="permute", 
#       perf.type="gmean",
#       splitrule="auc",
#       block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=train,
perf.type = "gmean",
ntree=1000,
splitrule="auc")

print(rfobj)
                         Sample size: 1542
           Frequency of class labels: 756, 786
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 335.866
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 975
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0397
                   (OOB) Brier score: 0.18674628
        (OOB) Normalized Brier score: 0.74698513
                           (OOB) AUC: 0.85467195
                        (OOB) PR-AUC: 0.85195658
                        (OOB) G-mean: 0.77096032
   (OOB) Requested performance error: 0.22903968

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 579 177      0.2341
       No  176 610      0.2239

      (OOB) Misclassification rate: 0.2289235
plot(rfobj,plots.one.page = FALSE)


                             all   Yes   No
Ethnicity                 0.0247    NA   NA
Close.Family              0.0186    NA   NA
EnglishDiff               0.0182    NA   NA
Discrimination            0.0167    NA   NA
Close.Friends             0.0135    NA   NA
Religious.Attendance      0.0131    NA   NA
Expression                0.0129    NA   NA
Employment                0.0122    NA   NA
Helpful.Friends           0.0112    NA   NA
Successful.Family         0.0111    NA   NA
Family.Respect            0.0078    NA   NA
Religion                  0.0078    NA   NA
Age                       0.0078    NA   NA
Religious.Importance      0.0077    NA   NA
Community.Shares.Values   0.0070    NA   NA
Community.Trust           0.0069    NA   NA
See.Family                0.0069    NA   NA
Feel.Close                0.0064    NA   NA
EnglishSpeak              0.0058    NA   NA
Close.knit.Community      0.0050    NA   NA
Income_median             0.0045    NA   NA
Gender                    0.0041    NA   NA
Get.Along                 0.0039    NA   NA
Similar.Values            0.0037    NA   NA
Togetherness              0.0033    NA   NA
Dental.Insurance          0.0025    NA   NA
rfobj$importance
                                all Yes No
Ethnicity               0.024656644  NA NA
Age                     0.007786248  NA NA
Gender                  0.004090596  NA NA
Religion                0.007786248  NA NA
Employment              0.012172708  NA NA
Income_median           0.004463751  NA NA
EnglishSpeak            0.005822910  NA NA
EnglishDiff             0.018241775  NA NA
See.Family              0.006882684  NA NA
Close.Family            0.018638725  NA NA
Helpful.Family          0.002477147  NA NA
See.Friends             0.001179641  NA NA
Close.Friends           0.013519038  NA NA
Helpful.Friends         0.011207441  NA NA
Family.Respect          0.007786248  NA NA
Similar.Values          0.003708850  NA NA
Successful.Family       0.011084516  NA NA
Trust                   0.001841249  NA NA
Loyalty                 0.001179641  NA NA
Family.Pride            0.001841249  NA NA
Expression              0.012857228  NA NA
Spend.Time.Together     0.001205876  NA NA
Feel.Close              0.006424699  NA NA
Togetherness            0.003297697  NA NA
Religious.Attendance    0.013090351  NA NA
Religious.Importance    0.007693639  NA NA
Close.knit.Community    0.005047992  NA NA
Helpful.Community       0.001369863  NA NA
Community.Shares.Values 0.007031368  NA NA
Get.Along               0.003860209  NA NA
Community.Trust         0.006943256  NA NA
Health.Insurance        0.001929871  NA NA
Dental.Insurance        0.002477147  NA NA
Discrimination          0.016650486  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
  
plot(importance_plot)

ggsave(filename="family_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
199.0000000 185.0000000   1.0756757   0.4817708   0.5567568   0.5326633 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5255102   0.5638298   0.4557292   0.2425106   0.9700425   0.5905745 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5406824   0.5442197   0.4817708   0.5710029   0.5426296   0.5443983 
      gmean 
  0.5445768 
test_rf$importance
                                  all Yes No
Ethnicity                8.324236e-03  NA NA
Age                      2.019169e-02  NA NA
Gender                   1.022131e-03  NA NA
Religion                 3.149065e-03  NA NA
Employment              -3.321338e-04  NA NA
Income_median            4.295393e-04  NA NA
EnglishSpeak             2.228164e-03  NA NA
EnglishDiff              1.568637e-03  NA NA
See.Family              -1.910117e-03  NA NA
Close.Family            -1.940504e-03  NA NA
Helpful.Family           6.466441e-03  NA NA
See.Friends              5.652327e-03  NA NA
Close.Friends            2.840038e-04  NA NA
Helpful.Friends         -5.760976e-04  NA NA
Family.Respect          -2.849889e-04  NA NA
Similar.Values          -1.034093e-03  NA NA
Successful.Family        3.978352e-05  NA NA
Trust                   -2.123316e-03  NA NA
Loyalty                  5.968539e-04  NA NA
Family.Pride             2.709423e-04  NA NA
Expression               1.577996e-03  NA NA
Spend.Time.Together      2.257353e-04  NA NA
Feel.Close               2.696780e-04  NA NA
Togetherness             2.825447e-03  NA NA
Religious.Attendance     1.433416e-03  NA NA
Religious.Importance    -3.354434e-03  NA NA
Close.knit.Community    -9.100764e-05  NA NA
Helpful.Community       -1.937361e-03  NA NA
Community.Shares.Values  9.637355e-04  NA NA
Get.Along                2.370701e-03  NA NA
Community.Trust          1.025102e-03  NA NA
Health.Insurance        -5.639420e-05  NA NA
Dental.Insurance         4.379893e-04  NA NA
Discrimination           2.733514e-04  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  

importance_plot

ggsave(filename="family_test_VIMP.png",width=5,height=5,units="in")

Source of Information: Health Professionals

ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728
rfdata <- qol |> 
  select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imbalanced(Heal.Professionals ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")->rfobj

print(rfobj)
                         Sample size: 1927
           Frequency of class labels: 925, 1002
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 529.8617
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1218
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0832
                   (OOB) Brier score: 0.22739259
        (OOB) Normalized Brier score: 0.90957038
                           (OOB) AUC: 0.67375951
                        (OOB) PR-AUC: 0.63055041
                        (OOB) G-mean: 0.62051216
   (OOB) Requested performance error: 0.37948784

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  562 363      0.3924
       Yes 367 635      0.3663

      (OOB) Misclassification rate: 0.3788272
plot(rfobj,plots.one.page = FALSE)


                              all   No   Yes
EnglishSpeak               0.0087   NA    NA
Get.Along                  0.0077   NA    NA
Community.Shares.Values    0.0076   NA    NA
Spend.Time.Together        0.0068   NA    NA
Expression                 0.0065   NA    NA
Gender                     0.0064   NA    NA
Age                        0.0056   NA    NA
Similar.Values             0.0049   NA    NA
Health.Insurance           0.0049   NA    NA
Feel.Close                 0.0048   NA    NA
Community.Trust            0.0045   NA    NA
Family.Pride               0.0045   NA    NA
Income_median              0.0041   NA    NA
Discrimination             0.0039   NA    NA
Dental.Insurance           0.0034   NA    NA
Helpful.Community          0.0031   NA    NA
Family.Respect             0.0030   NA    NA
Trust                      0.0026   NA    NA
Close.knit.Community       0.0012   NA    NA
Loyalty                    0.0006   NA    NA
Religious.Importance      -0.0008   NA    NA
Togetherness              -0.0008   NA    NA
Ethnicity                 -0.0018   NA    NA
Religious.Attendance      -0.0021   NA    NA
Religion                  -0.0023   NA    NA
Helpful.Family            -0.0023   NA    NA
rfobj$importance
                                  all No Yes
Ethnicity               -0.0018143542 NA  NA
Age                      0.0056014941 NA  NA
Gender                   0.0063757609 NA  NA
Religion                -0.0022995991 NA  NA
Employment              -0.0024295113 NA  NA
Income_median            0.0041083863 NA  NA
EnglishSpeak             0.0086779884 NA  NA
EnglishDiff             -0.0028554330 NA  NA
See.Family              -0.0040666801 NA  NA
Close.Family            -0.0026329858 NA  NA
Helpful.Family          -0.0023203869 NA  NA
See.Friends             -0.0059329832 NA  NA
Close.Friends           -0.0071864210 NA  NA
Helpful.Friends         -0.0065896944 NA  NA
Family.Respect           0.0029979134 NA  NA
Similar.Values           0.0049175798 NA  NA
Successful.Family       -0.0029246611 NA  NA
Trust                    0.0026337288 NA  NA
Loyalty                  0.0005523029 NA  NA
Family.Pride             0.0045040927 NA  NA
Expression               0.0065145648 NA  NA
Spend.Time.Together      0.0067966620 NA  NA
Feel.Close               0.0048483539 NA  NA
Togetherness            -0.0008427275 NA  NA
Religious.Attendance    -0.0020812857 NA  NA
Religious.Importance    -0.0007880284 NA  NA
Close.knit.Community     0.0012392363 NA  NA
Helpful.Community        0.0030590686 NA  NA
Community.Shares.Values  0.0075557164 NA  NA
Get.Along                0.0077018301 NA  NA
Community.Trust          0.0045040927 NA  NA
Health.Insurance         0.0049053114 NA  NA
Dental.Insurance         0.0033728024 NA  NA
Discrimination           0.0039211271 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

pos<- rfdata |> filter(Heal.Professionals=="Yes")
neg <- rfdata |> filter(Heal.Professionals==0)

set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Heal.Professionals~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Heal.Professionals ~ .,importance=T,data=train,
                    perf.type = "gmean",
                    ntree=1000,
                    splitrule="auc")
print(rfobj)
                         Sample size: 1542
           Frequency of class labels: 756, 786
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 322.849
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 975
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0397
                   (OOB) Brier score: 0.17657038
        (OOB) Normalized Brier score: 0.70628152
                           (OOB) AUC: 0.85464797
                        (OOB) PR-AUC: 0.84380916
                        (OOB) G-mean: 0.77511401
   (OOB) Requested performance error: 0.22488599

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 597 159      0.2103
       No  188 598      0.2392

      (OOB) Misclassification rate: 0.2250324
print(rfobj)
                         Sample size: 1542
           Frequency of class labels: 756, 786
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 322.849
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 975
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0397
                   (OOB) Brier score: 0.17657038
        (OOB) Normalized Brier score: 0.70628152
                           (OOB) AUC: 0.85464797
                        (OOB) PR-AUC: 0.84380916
                        (OOB) G-mean: 0.77511401
   (OOB) Requested performance error: 0.22488599

Confusion matrix:

          predicted
  observed Yes  No class.error
       Yes 597 159      0.2103
       No  188 598      0.2392

      (OOB) Misclassification rate: 0.2250324
plot(rfobj,plots.one.page = FALSE)


                             all   Yes   No
EnglishSpeak              0.0311    NA   NA
EnglishDiff               0.0209    NA   NA
Dental.Insurance          0.0201    NA   NA
Ethnicity                 0.0164    NA   NA
Religious.Importance      0.0162    NA   NA
See.Friends               0.0149    NA   NA
Income_median             0.0143    NA   NA
Religion                  0.0130    NA   NA
Community.Trust           0.0123    NA   NA
Close.Family              0.0118    NA   NA
See.Family                0.0117    NA   NA
Successful.Family         0.0110    NA   NA
Get.Along                 0.0104    NA   NA
Close.knit.Community      0.0104    NA   NA
Religious.Attendance      0.0104    NA   NA
Feel.Close                0.0104    NA   NA
Helpful.Family            0.0093    NA   NA
Age                       0.0091    NA   NA
Discrimination            0.0078    NA   NA
Helpful.Friends           0.0071    NA   NA
Helpful.Community         0.0066    NA   NA
Community.Shares.Values   0.0065    NA   NA
Togetherness              0.0058    NA   NA
Family.Respect            0.0058    NA   NA
Similar.Values            0.0058    NA   NA
Employment                0.0045    NA   NA
rfobj$importance
                                 all Yes No
Ethnicity               0.0164029140  NA NA
Age                     0.0090830369  NA NA
Gender                  0.0039245209  NA NA
Religion                0.0130344581  NA NA
Employment              0.0045412390  NA NA
Income_median           0.0143163422  NA NA
EnglishSpeak            0.0311343089  NA NA
EnglishDiff             0.0209034590  NA NA
See.Family              0.0116885920  NA NA
Close.Family            0.0117547266  NA NA
Helpful.Family          0.0092522165  NA NA
See.Friends             0.0149282081  NA NA
Close.Friends           0.0013581594  NA NA
Helpful.Friends         0.0071357674  NA NA
Family.Respect          0.0058385032  NA NA
Similar.Values          0.0058374094  NA NA
Successful.Family       0.0110275602  NA NA
Trust                   0.0039048813  NA NA
Loyalty                 0.0038917886  NA NA
Family.Pride            0.0038939707  NA NA
Expression              0.0012972629  NA NA
Spend.Time.Together     0.0019728304  NA NA
Feel.Close              0.0103847047  NA NA
Togetherness            0.0058385032  NA NA
Religious.Attendance    0.0103913067  NA NA
Religious.Importance    0.0162199437  NA NA
Close.knit.Community    0.0104001093  NA NA
Helpful.Community       0.0066308331  NA NA
Community.Shares.Values 0.0064928827  NA NA
Get.Along               0.0104111128  NA NA
Community.Trust         0.0123248245  NA NA
Health.Insurance        0.0006527058  NA NA
Dental.Insurance        0.0201072957  NA NA
Discrimination          0.0077967363  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
  
plot(importance_plot)

ggsave(filename="healthpro_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
200.0000000 185.0000000   1.0810811   0.4805195   0.6000000   0.6200000 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5935829   0.6262626   0.3896104   0.2335855   0.9343418   0.6500811 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5967742   0.6096605   0.4805195   0.6277839   0.6033461   0.6097893 
      gmean 
  0.6099180 
test_rf$importance
                                  all Yes No
Ethnicity                1.416251e-03  NA NA
Age                      3.833091e-04  NA NA
Gender                   5.709630e-04  NA NA
Religion                 3.265128e-04  NA NA
Employment               5.719352e-05  NA NA
Income_median            3.094068e-03  NA NA
EnglishSpeak             1.868566e-02  NA NA
EnglishDiff              1.095154e-02  NA NA
See.Family               2.481570e-03  NA NA
Close.Family             5.369103e-04  NA NA
Helpful.Family           2.884903e-03  NA NA
See.Friends              1.065651e-03  NA NA
Close.Friends            1.526279e-03  NA NA
Helpful.Friends          4.299457e-04  NA NA
Family.Respect           1.237963e-03  NA NA
Similar.Values          -2.771710e-03  NA NA
Successful.Family       -9.325210e-04  NA NA
Trust                   -1.068838e-03  NA NA
Loyalty                  4.830675e-04  NA NA
Family.Pride             5.179147e-04  NA NA
Expression               1.251865e-03  NA NA
Spend.Time.Together      6.868427e-04  NA NA
Feel.Close               7.449733e-04  NA NA
Togetherness            -2.894697e-04  NA NA
Religious.Attendance     1.826614e-03  NA NA
Religious.Importance     9.029164e-04  NA NA
Close.knit.Community    -3.583688e-05  NA NA
Helpful.Community        1.156417e-03  NA NA
Community.Shares.Values -1.892850e-03  NA NA
Get.Along               -6.693266e-04  NA NA
Community.Trust          2.128981e-03  NA NA
Health.Insurance         3.435879e-03  NA NA
Dental.Insurance         8.937844e-03  NA NA
Discrimination           4.512359e-03  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
importance_plot

ggsave(filename="healthpro_test_VIMP.png",width=5,height=5,units="in")

Health Insurance

ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1936
           Frequency of class labels: 259, 1677
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 295.831
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1224
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.4749
                   (OOB) Brier score: 0.10519154
        (OOB) Normalized Brier score: 0.42076617
                           (OOB) AUC: 0.7338273
                        (OOB) PR-AUC: 0.32141853
                        (OOB) G-mean: 0.66878662
   (OOB) Requested performance error: 0.33121338

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   203  56      0.2162
       Yes 720 957      0.4293

      (OOB) Misclassification rate: 0.4008264
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1677.0000000  259.0000000    6.4749035    0.1337810    0.7837838    0.5706619 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2199350    0.9447187    0.4008264    0.1051915    0.4207662    0.7338273 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3434856    0.4633100    0.1337810    0.3214185    0.5061361    0.5660483 
       gmean 
   0.6687866 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
# 
# 
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])

imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",
                    ntree=1000,
                    splitrule="auc")
print(rfobj)
                         Sample size: 1550
           Frequency of class labels: 760, 790
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 278.29
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 980
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0395
                   (OOB) Brier score: 0.12406003
        (OOB) Normalized Brier score: 0.49624012
                           (OOB) AUC: 0.9650458
                        (OOB) PR-AUC: 0.96253384
                        (OOB) G-mean: 0.89247353
   (OOB) Requested performance error: 0.10752647

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 656 104      0.1368
       0    61 729      0.0772

      (OOB) Misclassification rate: 0.1064516
print(rfobj)
                         Sample size: 1550
           Frequency of class labels: 760, 790
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 278.29
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 980
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0395
                   (OOB) Brier score: 0.12406003
        (OOB) Normalized Brier score: 0.49624012
                           (OOB) AUC: 0.9650458
                        (OOB) PR-AUC: 0.96253384
                        (OOB) G-mean: 0.89247353
   (OOB) Requested performance error: 0.10752647

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 656 104      0.1368
       0    61 729      0.0772

      (OOB) Misclassification rate: 0.1064516
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Religion                  0.0225    NA   NA
EnglishSpeak              0.0222    NA   NA
EnglishDiff               0.0149    NA   NA
Community.Shares.Values   0.0135    NA   NA
Income_median             0.0129    NA   NA
Close.Family              0.0125    NA   NA
Community.Trust           0.0122    NA   NA
Ethnicity                 0.0116    NA   NA
Religious.Attendance      0.0114    NA   NA
Helpful.Community         0.0101    NA   NA
Close.Friends             0.0096    NA   NA
Helpful.Family            0.0092    NA   NA
Employment                0.0089    NA   NA
See.Friends               0.0087    NA   NA
Get.Along                 0.0074    NA   NA
Religious.Importance      0.0074    NA   NA
Age                       0.0073    NA   NA
Family.Respect            0.0067    NA   NA
Expression                0.0052    NA   NA
Helpful.Friends           0.0046    NA   NA
Togetherness              0.0045    NA   NA
Close.knit.Community      0.0041    NA   NA
Family.Pride              0.0039    NA   NA
Feel.Close                0.0037    NA   NA
See.Family                0.0036    NA   NA
Similar.Values            0.0034    NA   NA
rfobj$importance
                                all Yes  0
Ethnicity               0.011608770  NA NA
Age                     0.007319489  NA NA
Gender                  0.003407701  NA NA
Religion                0.022516372  NA NA
Employment              0.008935426  NA NA
Income_median           0.012900201  NA NA
EnglishSpeak            0.022248379  NA NA
EnglishDiff             0.014910657  NA NA
See.Family              0.003558520  NA NA
Close.Family            0.012486547  NA NA
Helpful.Family          0.009229550  NA NA
See.Friends             0.008726205  NA NA
Close.Friends           0.009591684  NA NA
Helpful.Friends         0.004628107  NA NA
Family.Respect          0.006676196  NA NA
Similar.Values          0.003407701  NA NA
Successful.Family       0.002043055  NA NA
Trust                   0.002517355  NA NA
Loyalty                 0.001972914  NA NA
Family.Pride            0.003946460  NA NA
Expression              0.005169485  NA NA
Spend.Time.Together     0.003196971  NA NA
Feel.Close              0.003744036  NA NA
Togetherness            0.004489298  NA NA
Religious.Attendance    0.011370559  NA NA
Religious.Importance    0.007434278  NA NA
Close.knit.Community    0.004117995  NA NA
Helpful.Community       0.010143658  NA NA
Community.Shares.Values 0.013535732  NA NA
Get.Along               0.007434278  NA NA
Community.Trust         0.012243368  NA NA
Discrimination          0.001096147  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename="HIns_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
335.0000000  51.0000000   6.5686275   0.1321244   0.6470588   0.5492537 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1793478   0.9108911   0.4378238   0.1121928   0.4487711   0.6093649 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.2808511   0.3984187   0.1321244   0.1919435   0.4385025   0.4972863 
      gmean 
  0.5961539 
test_rf$importance
                                  all Yes  0
Ethnicity               -0.0007170282  NA NA
Age                     -0.0065876180  NA NA
Gender                   0.0016858270  NA NA
Religion                 0.0032049218  NA NA
Employment               0.0018750855  NA NA
Income_median            0.0266195366  NA NA
EnglishSpeak             0.0087105042  NA NA
EnglishDiff             -0.0040702210  NA NA
See.Family               0.0023935099  NA NA
Close.Family            -0.0060360251  NA NA
Helpful.Family          -0.0037656635  NA NA
See.Friends             -0.0030925577  NA NA
Close.Friends            0.0010820995  NA NA
Helpful.Friends          0.0030769449  NA NA
Family.Respect           0.0017744430  NA NA
Similar.Values          -0.0015062104  NA NA
Successful.Family       -0.0028634419  NA NA
Trust                    0.0005119584  NA NA
Loyalty                 -0.0021556567  NA NA
Family.Pride             0.0005819887  NA NA
Expression               0.0011362520  NA NA
Spend.Time.Together      0.0007929601  NA NA
Feel.Close               0.0002449711  NA NA
Togetherness             0.0028090090  NA NA
Religious.Attendance     0.0053798612  NA NA
Religious.Importance     0.0005212834  NA NA
Close.knit.Community     0.0017700534  NA NA
Helpful.Community       -0.0028534784  NA NA
Community.Shares.Values -0.0011828800  NA NA
Get.Along               -0.0014814092  NA NA
Community.Trust         -0.0024477784  NA NA
Discrimination           0.0008231928  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  

importance_plot

ggsave(filename="HIns_test_VIMP.png",width=5,height=5,units="in")

Dental Insurance

ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1932
           Frequency of class labels: 760, 1172
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 451.0523
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1221
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.5421
                   (OOB) Brier score: 0.17933324
        (OOB) Normalized Brier score: 0.71733296
                           (OOB) AUC: 0.79775743
                        (OOB) PR-AUC: 0.71577963
                        (OOB) G-mean: 0.72787095
   (OOB) Requested performance error: 0.27212905

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   572 188      0.2474
       Yes 347 825      0.2961

      (OOB) Misclassification rate: 0.2769151
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1172.0000000  760.0000000    1.5421053    0.3933747    0.7526316    0.7039249 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6224157    0.8144126    0.2769151    0.1793332    0.7173330    0.7977574 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6813580    0.7163581    0.3933747    0.7157796    0.7046145    0.7221145 
       gmean 
   0.7278710 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
                    perf.type = "gmean",
                    ntree=1000,
                    splitrule="auc")
print(rfobj)
                         Sample size: 1546
           Frequency of class labels: 757, 789
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 285.066
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 977
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0423
                   (OOB) Brier score: 0.13945449
        (OOB) Normalized Brier score: 0.55781796
                           (OOB) AUC: 0.91210887
                        (OOB) PR-AUC: 0.90552285
                        (OOB) G-mean: 0.84478415
   (OOB) Requested performance error: 0.15521585

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 625 132      0.1744
       0   107 682      0.1356

      (OOB) Misclassification rate: 0.1545925
print(rfobj)
                         Sample size: 1546
           Frequency of class labels: 757, 789
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 285.066
No. of variables tried at each split: 6
              Total no. of variables: 32
       Resampling used to grow trees: swor
    Resample size used to grow trees: 977
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0423
                   (OOB) Brier score: 0.13945449
        (OOB) Normalized Brier score: 0.55781796
                           (OOB) AUC: 0.91210887
                        (OOB) PR-AUC: 0.90552285
                        (OOB) G-mean: 0.84478415
   (OOB) Requested performance error: 0.15521585

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 625 132      0.1744
       0   107 682      0.1356

      (OOB) Misclassification rate: 0.1545925
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Income_median             0.0348    NA   NA
EnglishSpeak              0.0324    NA   NA
Age                       0.0303    NA   NA
Ethnicity                 0.0293    NA   NA
Employment                0.0257    NA   NA
Religion                  0.0234    NA   NA
EnglishDiff               0.0226    NA   NA
See.Friends               0.0197    NA   NA
Helpful.Family            0.0184    NA   NA
Close.Family              0.0152    NA   NA
Community.Shares.Values   0.0142    NA   NA
Close.knit.Community      0.0141    NA   NA
Helpful.Community         0.0136    NA   NA
Close.Friends             0.0133    NA   NA
Discrimination            0.0130    NA   NA
See.Family                0.0123    NA   NA
Expression                0.0115    NA   NA
Community.Trust           0.0114    NA   NA
Helpful.Friends           0.0114    NA   NA
Get.Along                 0.0110    NA   NA
Religious.Importance      0.0092    NA   NA
Family.Respect            0.0091    NA   NA
Feel.Close                0.0084    NA   NA
Similar.Values            0.0078    NA   NA
Religious.Attendance      0.0077    NA   NA
Togetherness              0.0077    NA   NA
rfobj$importance
                                all Yes  0
Ethnicity               0.029295012  NA NA
Age                     0.030250260  NA NA
Gender                  0.005238544  NA NA
Religion                0.023372807  NA NA
Employment              0.025687492  NA NA
Income_median           0.034751917  NA NA
EnglishSpeak            0.032383582  NA NA
EnglishDiff             0.022637313  NA NA
See.Family              0.012275885  NA NA
Close.Family            0.015189066  NA NA
Helpful.Family          0.018379898  NA NA
See.Friends             0.019742478  NA NA
Close.Friends           0.013263933  NA NA
Helpful.Friends         0.011443697  NA NA
Family.Respect          0.009066287  NA NA
Similar.Values          0.007771092  NA NA
Successful.Family       0.005800118  NA NA
Trust                   0.005124879  NA NA
Loyalty                 0.006216303  NA NA
Family.Pride            0.006475900  NA NA
Expression              0.011493926  NA NA
Spend.Time.Together     0.005019204  NA NA
Feel.Close              0.008390413  NA NA
Togetherness            0.007715085  NA NA
Religious.Attendance    0.007715085  NA NA
Religious.Importance    0.009184496  NA NA
Close.knit.Community    0.014083837  NA NA
Helpful.Community       0.013571050  NA NA
Community.Shares.Values 0.014247108  NA NA
Get.Along               0.011037954  NA NA
Community.Trust         0.011443697  NA NA
Discrimination          0.012951897  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename="DIns_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
234.0000000 152.0000000   1.5394737   0.3937824   0.8223684   0.6880342 
       prec         npv    misclass       brier  brier.norm         auc 
  0.6313131   0.8563830   0.2590674   0.1835812   0.7343248   0.7969242 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.7142857   0.7378552   0.3937824   0.6985360   0.7332471   0.7450318 
      gmean 
  0.7522085 
test_rf$importance
                                  all Yes  0
Ethnicity                0.0074850819  NA NA
Age                      0.0138419389  NA NA
Gender                   0.0011070968  NA NA
Religion                 0.0006469828  NA NA
Employment               0.0171435775  NA NA
Income_median            0.0667999061  NA NA
EnglishSpeak             0.0225371650  NA NA
EnglishDiff              0.0062544370  NA NA
See.Family              -0.0009065089  NA NA
Close.Family             0.0030245757  NA NA
Helpful.Family           0.0052611352  NA NA
See.Friends              0.0007200479  NA NA
Close.Friends            0.0046701490  NA NA
Helpful.Friends          0.0052013977  NA NA
Family.Respect           0.0003192247  NA NA
Similar.Values           0.0008803404  NA NA
Successful.Family        0.0003647281  NA NA
Trust                    0.0021871845  NA NA
Loyalty                  0.0010393549  NA NA
Family.Pride             0.0023372421  NA NA
Expression               0.0027105884  NA NA
Spend.Time.Together      0.0026602853  NA NA
Feel.Close               0.0011880424  NA NA
Togetherness             0.0011204948  NA NA
Religious.Attendance     0.0020460395  NA NA
Religious.Importance    -0.0002700353  NA NA
Close.knit.Community     0.0013556106  NA NA
Helpful.Community        0.0006672165  NA NA
Community.Shares.Values  0.0013819268  NA NA
Get.Along                0.0034466808  NA NA
Community.Trust          0.0001582721  NA NA
Discrimination          -0.0012461848  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
importance_plot

ggsave(filename="DIns_test_VIMP.png",width=5,height=5,units="in")

Physical Checkup

ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1918
           Frequency of class labels: 614, 1304
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 451.908
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1212
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 2.1238
                   (OOB) Brier score: 0.18414117
        (OOB) Normalized Brier score: 0.73656469
                           (OOB) AUC: 0.74369423
                        (OOB) PR-AUC: 0.55768871
                        (OOB) G-mean: 0.6964641
   (OOB) Requested performance error: 0.3035359

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   432 182      0.2964
       Yes 405 899      0.3106

      (OOB) Misclassification rate: 0.306048
plot(imb,plots.one.page = F)


                              all    0   Yes
Age                        0.0451   NA    NA
Health.Insurance           0.0367   NA    NA
Dental.Insurance           0.0277   NA    NA
Gender                     0.0172   NA    NA
Income_median              0.0074   NA    NA
EnglishDiff                0.0061   NA    NA
Community.Shares.Values    0.0055   NA    NA
Employment                 0.0055   NA    NA
Discrimination             0.0045   NA    NA
Togetherness               0.0043   NA    NA
EnglishSpeak               0.0035   NA    NA
Helpful.Family             0.0032   NA    NA
Close.knit.Community       0.0029   NA    NA
Religious.Importance       0.0023   NA    NA
Close.Family               0.0019   NA    NA
Get.Along                  0.0019   NA    NA
Religion                   0.0016   NA    NA
See.Family                 0.0016   NA    NA
Ethnicity                  0.0012   NA    NA
Loyalty                    0.0008   NA    NA
Family.Respect             0.0000   NA    NA
Trust                     -0.0001   NA    NA
Religious.Attendance      -0.0005   NA    NA
See.Friends               -0.0008   NA    NA
Community.Trust           -0.0008   NA    NA
Family.Pride              -0.0015   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1304.0000000  614.0000000    2.1237785    0.3201251    0.7035831    0.6894172 
        prec          npv     misclass        brier   brier.norm          auc 
   0.5161290    0.8316374    0.3060480    0.1841412    0.7365647    0.7436942 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.5954514    0.6653643    0.3201251    0.5576887    0.6459578    0.6809142 
       gmean 
   0.6964641 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",
                    ntree=1000,
                    splitrule="auc")
print(rfobj)
                         Sample size: 1536
           Frequency of class labels: 754, 782
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 296.945
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 971
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0371
                   (OOB) Brier score: 0.14645696
        (OOB) Normalized Brier score: 0.58582786
                           (OOB) AUC: 0.91847572
                        (OOB) PR-AUC: 0.91302995
                        (OOB) G-mean: 0.84643071
   (OOB) Requested performance error: 0.15356929

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 658  96      0.1273
       0   140 642      0.1790

      (OOB) Misclassification rate: 0.1536458
print(rfobj)
                         Sample size: 1536
           Frequency of class labels: 754, 782
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 296.945
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 971
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0371
                   (OOB) Brier score: 0.14645696
        (OOB) Normalized Brier score: 0.58582786
                           (OOB) AUC: 0.91847572
                        (OOB) PR-AUC: 0.91302995
                        (OOB) G-mean: 0.84643071
   (OOB) Requested performance error: 0.15356929

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 658  96      0.1273
       0   140 642      0.1790

      (OOB) Misclassification rate: 0.1536458
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Age                       0.0306    NA   NA
Ethnicity                 0.0290    NA   NA
Health.Insurance          0.0253    NA   NA
Religion                  0.0209    NA   NA
Discrimination            0.0190    NA   NA
EnglishDiff               0.0178    NA   NA
Gender                    0.0177    NA   NA
Helpful.Family            0.0151    NA   NA
EnglishSpeak              0.0145    NA   NA
Close.Family              0.0131    NA   NA
See.Family                0.0123    NA   NA
Religious.Importance      0.0120    NA   NA
Dental.Insurance          0.0110    NA   NA
Spend.Time.Together       0.0106    NA   NA
See.Friends               0.0092    NA   NA
Religious.Attendance      0.0087    NA   NA
Close.Friends             0.0085    NA   NA
Income_median             0.0085    NA   NA
Helpful.Community         0.0079    NA   NA
Helpful.Friends           0.0072    NA   NA
Close.knit.Community      0.0067    NA   NA
Successful.Family         0.0066    NA   NA
Community.Shares.Values   0.0060    NA   NA
Family.Respect            0.0040    NA   NA
Expression                0.0039    NA   NA
Community.Trust           0.0034    NA   NA
rfobj$importance
                                 all Yes  0
Ethnicity               0.0290438628  NA NA
Age                     0.0305609750  NA NA
Gender                  0.0176675835  NA NA
Religion                0.0209462698  NA NA
Employment              0.0033749309  NA NA
Income_median           0.0084739447  NA NA
EnglishSpeak            0.0145220898  NA NA
EnglishDiff             0.0177954937  NA NA
See.Family              0.0123384853  NA NA
Close.Family            0.0130819988  NA NA
Helpful.Family          0.0150982127  NA NA
See.Friends             0.0092231420  NA NA
Close.Friends           0.0085124007  NA NA
Helpful.Friends         0.0072321580  NA NA
Family.Respect          0.0039877196  NA NA
Similar.Values          0.0006775183  NA NA
Successful.Family       0.0065927686  NA NA
Trust                   0.0032823973  NA NA
Loyalty                 0.0019799576  NA NA
Family.Pride            0.0033749309  NA NA
Expression              0.0039434310  NA NA
Spend.Time.Together     0.0106139711  NA NA
Feel.Close              0.0033025124  NA NA
Togetherness            0.0032823973  NA NA
Religious.Attendance    0.0087107806  NA NA
Religious.Importance    0.0119857772  NA NA
Close.knit.Community    0.0067058637  NA NA
Helpful.Community       0.0078952941  NA NA
Community.Shares.Values 0.0059538655  NA NA
Get.Along               0.0013595936  NA NA
Community.Trust         0.0033749309  NA NA
Health.Insurance        0.0253433804  NA NA
Dental.Insurance        0.0110361377  NA NA
Discrimination          0.0189547483  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename="PChk_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
260.0000000 122.0000000   2.1311475   0.3193717   0.5983607   0.7038462 
       prec         npv    misclass       brier  brier.norm         auc 
  0.4866667   0.7887931   0.3298429   0.1941655   0.7766620   0.6926860 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5367647   0.6235821   0.3193717   0.5100775   0.5928642   0.6362729 
      gmean 
  0.6489637 
test_rf$importance
                                  all Yes  0
Ethnicity               -4.485200e-03  NA NA
Age                      1.456814e-02  NA NA
Gender                   1.332700e-03  NA NA
Religion                 2.523687e-03  NA NA
Employment               1.750327e-04  NA NA
Income_median            5.524786e-03  NA NA
EnglishSpeak            -9.543476e-04  NA NA
EnglishDiff              1.994580e-03  NA NA
See.Family              -2.830507e-03  NA NA
Close.Family            -8.254770e-05  NA NA
Helpful.Family           1.710651e-03  NA NA
See.Friends              1.754683e-03  NA NA
Close.Friends           -1.154867e-03  NA NA
Helpful.Friends          5.813948e-04  NA NA
Family.Respect          -2.349749e-04  NA NA
Similar.Values           1.152939e-04  NA NA
Successful.Family       -1.139116e-03  NA NA
Trust                   -1.046174e-03  NA NA
Loyalty                  4.127064e-04  NA NA
Family.Pride            -9.711867e-07  NA NA
Expression              -8.976143e-04  NA NA
Spend.Time.Together     -3.881086e-04  NA NA
Feel.Close               3.533802e-04  NA NA
Togetherness            -1.157534e-04  NA NA
Religious.Attendance    -1.493394e-03  NA NA
Religious.Importance    -1.103743e-03  NA NA
Close.knit.Community     3.038849e-03  NA NA
Helpful.Community       -2.561196e-03  NA NA
Community.Shares.Values  1.147596e-03  NA NA
Get.Along                2.864171e-03  NA NA
Community.Trust         -1.760672e-03  NA NA
Health.Insurance         2.784247e-02  NA NA
Dental.Insurance         2.954517e-02  NA NA
Discrimination           2.360783e-03  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
importance_plot

ggsave(filename="PChk_test_VIMP.png",width=5,height=5,units="in")

Dental Checkup

ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1915
           Frequency of class labels: 786, 1129
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 472.9973
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1210
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.4364
                   (OOB) Brier score: 0.19312832
        (OOB) Normalized Brier score: 0.77251328
                           (OOB) AUC: 0.76845291
                        (OOB) PR-AUC: 0.66442725
                        (OOB) G-mean: 0.69989128
   (OOB) Requested performance error: 0.30010872

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   566 220      0.2799
       Yes 361 768      0.3198

      (OOB) Misclassification rate: 0.3033943
plot(imb,plots.one.page = F)


                              all    0   Yes
Dental.Insurance           0.0207   NA    NA
Health.Insurance           0.0023   NA    NA
Income_median              0.0014   NA    NA
Helpful.Community          0.0012   NA    NA
EnglishDiff                0.0006   NA    NA
Community.Trust            0.0002   NA    NA
Spend.Time.Together       -0.0006   NA    NA
EnglishSpeak              -0.0006   NA    NA
Helpful.Friends           -0.0011   NA    NA
Employment                -0.0012   NA    NA
Family.Respect            -0.0014   NA    NA
Feel.Close                -0.0015   NA    NA
Togetherness              -0.0016   NA    NA
Loyalty                   -0.0017   NA    NA
Family.Pride              -0.0017   NA    NA
Helpful.Family            -0.0017   NA    NA
Religion                  -0.0020   NA    NA
Community.Shares.Values   -0.0021   NA    NA
Trust                     -0.0023   NA    NA
Close.Friends             -0.0025   NA    NA
Discrimination            -0.0026   NA    NA
Close.knit.Community      -0.0026   NA    NA
See.Family                -0.0028   NA    NA
Expression                -0.0031   NA    NA
Successful.Family         -0.0040   NA    NA
Get.Along                 -0.0042   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1129.0000000  786.0000000    1.4363868    0.4104439    0.7201018    0.6802480 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6105717    0.7773279    0.3033943    0.1931283    0.7725133    0.7684529 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6608290    0.6916811    0.4104439    0.6644272    0.6803601    0.6957862 
       gmean 
   0.6998913 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
                    perf.type = "gmean",
                    ntree=1000,
                    splitrule="auc")
print(rfobj)
                         Sample size: 1533
           Frequency of class labels: 754, 779
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 300.131
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 969
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0332
                   (OOB) Brier score: 0.15189957
        (OOB) Normalized Brier score: 0.60759826
                           (OOB) AUC: 0.89715441
                        (OOB) PR-AUC: 0.8931993
                        (OOB) G-mean: 0.81419411
   (OOB) Requested performance error: 0.18580589

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 622 132      0.1751
       0   153 626      0.1964

      (OOB) Misclassification rate: 0.18591
print(rfobj)
                         Sample size: 1533
           Frequency of class labels: 754, 779
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 300.131
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 969
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0332
                   (OOB) Brier score: 0.15189957
        (OOB) Normalized Brier score: 0.60759826
                           (OOB) AUC: 0.89715441
                        (OOB) PR-AUC: 0.8931993
                        (OOB) G-mean: 0.81419411
   (OOB) Requested performance error: 0.18580589

Confusion matrix:

          predicted
  observed Yes   0 class.error
       Yes 622 132      0.1751
       0   153 626      0.1964

      (OOB) Misclassification rate: 0.18591
plot(rfobj,plots.one.page = FALSE)


                             all   Yes    0
Ethnicity                 0.0347    NA   NA
Religion                  0.0230    NA   NA
Dental.Insurance          0.0197    NA   NA
Religious.Importance      0.0170    NA   NA
Age                       0.0164    NA   NA
EnglishSpeak              0.0163    NA   NA
EnglishDiff               0.0150    NA   NA
See.Friends               0.0130    NA   NA
Religious.Attendance      0.0117    NA   NA
Close.knit.Community      0.0105    NA   NA
Gender                    0.0104    NA   NA
Helpful.Friends           0.0098    NA   NA
Income_median             0.0098    NA   NA
Successful.Family         0.0098    NA   NA
Community.Shares.Values   0.0092    NA   NA
Helpful.Family            0.0091    NA   NA
Spend.Time.Together       0.0091    NA   NA
Feel.Close                0.0085    NA   NA
Community.Trust           0.0085    NA   NA
Family.Pride              0.0078    NA   NA
Expression                0.0072    NA   NA
Trust                     0.0065    NA   NA
Close.Friends             0.0065    NA   NA
Close.Family              0.0065    NA   NA
Togetherness              0.0059    NA   NA
Family.Respect            0.0059    NA   NA
rfobj$importance
                                all Yes  0
Ethnicity               0.034717565  NA NA
Age                     0.016382966  NA NA
Gender                  0.010434258  NA NA
Religion                0.023025972  NA NA
Employment              0.001304812  NA NA
Income_median           0.009794819  NA NA
EnglishSpeak            0.016308280  NA NA
EnglishDiff             0.015007732  NA NA
See.Family              0.001310048  NA NA
Close.Family            0.006519843  NA NA
Helpful.Family          0.009138969  NA NA
See.Friends             0.013048118  NA NA
Close.Friends           0.006520897  NA NA
Helpful.Friends         0.009794819  NA NA
Family.Respect          0.005867704  NA NA
Similar.Values          0.005231874  NA NA
Successful.Family       0.009784236  NA NA
Trust                   0.006520897  NA NA
Loyalty                 0.003927042  NA NA
Family.Pride            0.007834149  NA NA
Expression              0.007185167  NA NA
Spend.Time.Together     0.009130510  NA NA
Feel.Close              0.008509011  NA NA
Togetherness            0.005867704  NA NA
Religious.Attendance    0.011740123  NA NA
Religious.Importance    0.016962553  NA NA
Close.knit.Community    0.010460735  NA NA
Helpful.Community       0.005216090  NA NA
Community.Shares.Values 0.009155888  NA NA
Get.Along               0.004565001  NA NA
Community.Trust         0.008479427  NA NA
Health.Insurance        0.000838941  NA NA
Dental.Insurance        0.019654670  NA NA
Discrimination          0.004562898  NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename="DChk_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
225.0000000 157.0000000   1.4331210   0.4109948   0.7961783   0.7022222 
       prec         npv    misclass       brier  brier.norm         auc 
  0.6510417   0.8315789   0.2591623   0.1869835   0.7479338   0.7908846 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.7163324   0.7382005   0.4109948   0.6824346   0.7320292   0.7429632 
      gmean 
  0.7477260 
test_rf$importance
                                  all Yes  0
Ethnicity                8.791599e-03  NA NA
Age                      9.593166e-03  NA NA
Gender                   2.667960e-03  NA NA
Religion                 8.406042e-03  NA NA
Employment               1.257021e-03  NA NA
Income_median            5.186217e-03  NA NA
EnglishSpeak             1.030861e-02  NA NA
EnglishDiff              6.582020e-03  NA NA
See.Family               4.339230e-03  NA NA
Close.Family             3.508683e-03  NA NA
Helpful.Family           7.960862e-05  NA NA
See.Friends              3.703856e-03  NA NA
Close.Friends            3.804196e-03  NA NA
Helpful.Friends          1.915058e-03  NA NA
Family.Respect           1.003330e-03  NA NA
Similar.Values           2.391774e-03  NA NA
Successful.Family        3.963704e-03  NA NA
Trust                   -6.897584e-04  NA NA
Loyalty                  1.530624e-03  NA NA
Family.Pride             6.049649e-04  NA NA
Expression              -1.119621e-04  NA NA
Spend.Time.Together      1.312060e-03  NA NA
Feel.Close               1.578852e-03  NA NA
Togetherness            -8.745424e-04  NA NA
Religious.Attendance     2.143785e-04  NA NA
Religious.Importance     4.069140e-03  NA NA
Close.knit.Community     1.658804e-03  NA NA
Helpful.Community        5.234839e-04  NA NA
Community.Shares.Values -2.675590e-03  NA NA
Get.Along                1.237687e-03  NA NA
Community.Trust          1.371337e-03  NA NA
Health.Insurance         8.434049e-03  NA NA
Dental.Insurance         7.611210e-02  NA NA
Discrimination           3.339864e-03  NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
importance_plot

ggsave(filename="DChk_test_VIMP.png",width=5,height=5,units="in")

Urgent Care

ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1908
           Frequency of class labels: 1594, 314
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 357.6643
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1206
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 5.0764
                   (OOB) Brier score: 0.13517241
        (OOB) Normalized Brier score: 0.54068965
                           (OOB) AUC: 0.59929952
                        (OOB) PR-AUC: 0.23059263
                        (OOB) G-mean: 0.56307797
   (OOB) Requested performance error: 0.43692203

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   818 776      0.4868
       Yes 120 194      0.3822

      (OOB) Misclassification rate: 0.4696017
plot(imb,plots.one.page = F)


                              all    0   Yes
Age                        0.0126   NA    NA
Family.Pride               0.0055   NA    NA
Discrimination             0.0055   NA    NA
Spend.Time.Together        0.0046   NA    NA
Health.Insurance           0.0046   NA    NA
Helpful.Family             0.0039   NA    NA
Close.Family               0.0037   NA    NA
Trust                      0.0022   NA    NA
Similar.Values             0.0022   NA    NA
Dental.Insurance           0.0008   NA    NA
Employment                 0.0003   NA    NA
Togetherness               0.0000   NA    NA
Income_median              0.0000   NA    NA
Helpful.Community         -0.0005   NA    NA
See.Friends               -0.0015   NA    NA
Loyalty                   -0.0021   NA    NA
Close.knit.Community      -0.0021   NA    NA
Expression                -0.0025   NA    NA
Community.Shares.Values   -0.0033   NA    NA
Feel.Close                -0.0036   NA    NA
Family.Respect            -0.0040   NA    NA
Helpful.Friends           -0.0060   NA    NA
Successful.Family         -0.0068   NA    NA
Ethnicity                 -0.0068   NA    NA
EnglishDiff               -0.0071   NA    NA
Gender                    -0.0072   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1594.0000000  314.0000000    5.0764331    0.1645702    0.6178344    0.5131744 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2000000    0.8720682    0.4696017    0.1351724    0.5406897    0.5992995 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3021807    0.4117806    0.1645702    0.2305926    0.4326293    0.4874293 
       gmean 
   0.5630780 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
                    perf.type = "gmean",
                    ntree=1000,
                    splitrule="auc")
print(rfobj)
                         Sample size: 1528
           Frequency of class labels: 751, 777
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 305.7
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 966
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0346
                   (OOB) Brier score: 0.15162729
        (OOB) Normalized Brier score: 0.60650918
                           (OOB) AUC: 0.93551712
                        (OOB) PR-AUC: 0.92339775
                        (OOB) G-mean: 0.86102207
   (OOB) Requested performance error: 0.13897793

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   639 112      0.1491
       Yes 100 677      0.1287

      (OOB) Misclassification rate: 0.1387435
print(rfobj)
                         Sample size: 1528
           Frequency of class labels: 751, 777
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 305.7
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 966
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0346
                   (OOB) Brier score: 0.15162729
        (OOB) Normalized Brier score: 0.60650918
                           (OOB) AUC: 0.93551712
                        (OOB) PR-AUC: 0.92339775
                        (OOB) G-mean: 0.86102207
   (OOB) Requested performance error: 0.13897793

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   639 112      0.1491
       Yes 100 677      0.1287

      (OOB) Misclassification rate: 0.1387435
plot(rfobj,plots.one.page = FALSE)


                             all    0   Yes
Ethnicity                 0.0337   NA    NA
Religion                  0.0258   NA    NA
See.Friends               0.0235   NA    NA
Age                       0.0227   NA    NA
EnglishSpeak              0.0226   NA    NA
Community.Shares.Values   0.0221   NA    NA
Religious.Attendance      0.0167   NA    NA
Close.Friends             0.0163   NA    NA
Income_median             0.0162   NA    NA
Dental.Insurance          0.0160   NA    NA
Get.Along                 0.0148   NA    NA
Helpful.Community         0.0135   NA    NA
EnglishDiff               0.0134   NA    NA
Discrimination            0.0122   NA    NA
Close.knit.Community      0.0121   NA    NA
Togetherness              0.0117   NA    NA
Religious.Importance      0.0103   NA    NA
Loyalty                   0.0095   NA    NA
Community.Trust           0.0084   NA    NA
Helpful.Friends           0.0083   NA    NA
Similar.Values            0.0076   NA    NA
Family.Pride              0.0076   NA    NA
Trust                     0.0075   NA    NA
Close.Family              0.0075   NA    NA
Helpful.Family            0.0073   NA    NA
Spend.Time.Together       0.0058   NA    NA
rfobj$importance
                                  all  0 Yes
Ethnicity                0.0336813046 NA  NA
Age                      0.0227095880 NA  NA
Gender                   0.0032194041 NA  NA
Religion                 0.0258236476 NA  NA
Employment               0.0024545883 NA  NA
Income_median            0.0161892702 NA  NA
EnglishSpeak             0.0225562836 NA  NA
EnglishDiff              0.0133865189 NA  NA
See.Family              -0.0001910503 NA  NA
Close.Family             0.0075445536 NA  NA
Helpful.Family           0.0072624871 NA  NA
See.Friends              0.0235000617 NA  NA
Close.Friends            0.0163150446 NA  NA
Helpful.Friends          0.0082827844 NA  NA
Family.Respect           0.0036490383 NA  NA
Similar.Values           0.0076168419 NA  NA
Successful.Family        0.0051023980 NA  NA
Trust                    0.0075445536 NA  NA
Loyalty                  0.0095155977 NA  NA
Family.Pride             0.0075957572 NA  NA
Expression               0.0030955495 NA  NA
Spend.Time.Together      0.0058044547 NA  NA
Feel.Close               0.0018141052 NA  NA
Togetherness             0.0116808576 NA  NA
Religious.Attendance     0.0166752282 NA  NA
Religious.Importance     0.0103401445 NA  NA
Close.knit.Community     0.0121198210 NA  NA
Helpful.Community        0.0135199656 NA  NA
Community.Shares.Values  0.0220679404 NA  NA
Get.Along                0.0148292625 NA  NA
Community.Trust          0.0083611648 NA  NA
Health.Insurance         0.0011182957 NA  NA
Dental.Insurance         0.0160168680 NA  NA
Discrimination           0.0121682723 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
  
plot(importance_plot)

ggsave(filename="UC_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
318.0000000  62.0000000   5.1290323   0.1631579   0.4516129   0.6226415 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1891892   0.8534483   0.4052632   0.1379955   0.5519820   0.5185636 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.2666667   0.3891892   0.1631579   0.1733409   0.3984715   0.4597327 
      gmean 
  0.5302763 
test_rf$importance
                                  all  0 Yes
Ethnicity               -5.484342e-03 NA  NA
Age                      9.448379e-03 NA  NA
Gender                  -1.361491e-03 NA  NA
Religion                -4.088015e-03 NA  NA
Employment               2.721016e-03 NA  NA
Income_median            1.204174e-03 NA  NA
EnglishSpeak             3.732555e-03 NA  NA
EnglishDiff              5.565420e-04 NA  NA
See.Family               3.524575e-03 NA  NA
Close.Family            -1.506214e-03 NA  NA
Helpful.Family           2.415261e-03 NA  NA
See.Friends             -1.858872e-03 NA  NA
Close.Friends            1.177293e-02 NA  NA
Helpful.Friends         -1.709392e-03 NA  NA
Family.Respect           1.392512e-03 NA  NA
Similar.Values          -7.695964e-04 NA  NA
Successful.Family       -3.196832e-04 NA  NA
Trust                   -4.241532e-05 NA  NA
Loyalty                  1.841446e-03 NA  NA
Family.Pride            -7.113295e-04 NA  NA
Expression              -2.343188e-03 NA  NA
Spend.Time.Together     -1.792190e-03 NA  NA
Feel.Close              -1.040139e-03 NA  NA
Togetherness            -1.837450e-03 NA  NA
Religious.Attendance    -3.902555e-03 NA  NA
Religious.Importance    -3.452213e-03 NA  NA
Close.knit.Community     3.846090e-03 NA  NA
Helpful.Community        1.723408e-03 NA  NA
Community.Shares.Values -2.957451e-04 NA  NA
Get.Along               -3.175906e-03 NA  NA
Community.Trust          3.151664e-03 NA  NA
Health.Insurance         7.437195e-04 NA  NA
Dental.Insurance        -6.805305e-05 NA  NA
Discrimination          -7.013379e-03 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  

importance_plot

ggsave(filename="UC_test_VIMP.png",width=5,height=5,units="in")

Folk Medicine

ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest (randomForestSRC)

#install.packages("randomForestSRC)

rfdata <- qol |> 
  select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)

imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
                    perf.type = "gmean",splitrule="auc")
print(imb)
                         Sample size: 1899
           Frequency of class labels: 1642, 257
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 306.211
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1200
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.3891
                   (OOB) Brier score: 0.11175568
        (OOB) Normalized Brier score: 0.44702273
                           (OOB) AUC: 0.67616364
                        (OOB) PR-AUC: 0.23627522
                        (OOB) G-mean: 0.62877744
   (OOB) Requested performance error: 0.37122256

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   970 672      0.4093
       Yes  85 172      0.3307

      (OOB) Misclassification rate: 0.3986309
plot(imb,plots.one.page = F)


                           all    0   Yes
Age                     0.0299   NA    NA
Ethnicity               0.0196   NA    NA
Helpful.Friends         0.0083   NA    NA
EnglishSpeak            0.0072   NA    NA
Togetherness            0.0066   NA    NA
Feel.Close              0.0062   NA    NA
Family.Pride            0.0061   NA    NA
Family.Respect          0.0060   NA    NA
Religion                0.0060   NA    NA
Close.knit.Community    0.0051   NA    NA
Community.Trust         0.0050   NA    NA
Trust                   0.0047   NA    NA
Close.Friends           0.0046   NA    NA
EnglishDiff             0.0046   NA    NA
See.Friends             0.0044   NA    NA
Employment              0.0042   NA    NA
Religious.Importance    0.0034   NA    NA
Dental.Insurance        0.0031   NA    NA
Loyalty                 0.0031   NA    NA
Get.Along               0.0026   NA    NA
Health.Insurance        0.0019   NA    NA
Similar.Values          0.0019   NA    NA
See.Family              0.0018   NA    NA
Expression              0.0011   NA    NA
Helpful.Community       0.0009   NA    NA
Helpful.Family         -0.0002   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1642.0000000  257.0000000    6.3891051    0.1353344    0.6692607    0.5907430 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2037915    0.9194313    0.3986309    0.1117557    0.4470227    0.6761636 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3124432    0.4356551    0.1353344    0.2362752    0.4706103    0.5322163 
       gmean 
   0.6287774 
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
  
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
  geom_bar(stat = "identity", fill = "black") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()
  
plot(importance_plot)

Training/Test set Variable Importance

Training Importance

set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
                          data=imbal_train,
                          seed=3)$data
test<- rfdata[-imbal_index,]

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
                    perf.type = "gmean",
                    ntree=1000,
                    splitrule="auc")
print(rfobj)
                         Sample size: 1520
           Frequency of class labels: 747, 773
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 289.387
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 961
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0348
                   (OOB) Brier score: 0.13528264
        (OOB) Normalized Brier score: 0.54113056
                           (OOB) AUC: 0.94598402
                        (OOB) PR-AUC: 0.94457272
                        (OOB) G-mean: 0.87279673
   (OOB) Requested performance error: 0.12720327

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   632 115      0.1539
       Yes  77 696      0.0996

      (OOB) Misclassification rate: 0.1263158
print(rfobj)
                         Sample size: 1520
           Frequency of class labels: 747, 773
                     Number of trees: 1000
           Forest terminal node size: 1
       Average no. of terminal nodes: 289.387
No. of variables tried at each split: 6
              Total no. of variables: 34
       Resampling used to grow trees: swor
    Resample size used to grow trees: 961
                            Analysis: RFQ
                              Family: class
                      Splitting rule: auc *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0348
                   (OOB) Brier score: 0.13528264
        (OOB) Normalized Brier score: 0.54113056
                           (OOB) AUC: 0.94598402
                        (OOB) PR-AUC: 0.94457272
                        (OOB) G-mean: 0.87279673
   (OOB) Requested performance error: 0.12720327

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   632 115      0.1539
       Yes  77 696      0.0996

      (OOB) Misclassification rate: 0.1263158
plot(rfobj,plots.one.page = FALSE)


                             all    0   Yes
Ethnicity                 0.0279   NA    NA
EnglishSpeak              0.0202   NA    NA
Discrimination            0.0178   NA    NA
Age                       0.0163   NA    NA
Community.Shares.Values   0.0141   NA    NA
Family.Pride              0.0132   NA    NA
Religion                  0.0131   NA    NA
EnglishDiff               0.0127   NA    NA
Religious.Importance      0.0125   NA    NA
See.Friends               0.0103   NA    NA
Helpful.Community         0.0094   NA    NA
Religious.Attendance      0.0093   NA    NA
Helpful.Family            0.0091   NA    NA
Community.Trust           0.0083   NA    NA
Feel.Close                0.0082   NA    NA
Close.Friends             0.0076   NA    NA
Get.Along                 0.0075   NA    NA
Gender                    0.0057   NA    NA
Close.knit.Community      0.0047   NA    NA
Family.Respect            0.0036   NA    NA
Health.Insurance          0.0032   NA    NA
Helpful.Friends           0.0030   NA    NA
Trust                     0.0026   NA    NA
Togetherness              0.0025   NA    NA
Spend.Time.Together       0.0022   NA    NA
Dental.Insurance          0.0019   NA    NA
rfobj$importance
                                  all  0 Yes
Ethnicity                0.0279362812 NA  NA
Age                      0.0162905709 NA  NA
Gender                   0.0057173663 NA  NA
Religion                 0.0131471592 NA  NA
Employment              -0.0009241508 NA  NA
Income_median           -0.0009241508 NA  NA
EnglishSpeak             0.0202120121 NA  NA
EnglishDiff              0.0126768900 NA  NA
See.Family               0.0000000000 NA  NA
Close.Family             0.0001905046 NA  NA
Helpful.Family           0.0090983789 NA  NA
See.Friends              0.0102992711 NA  NA
Close.Friends            0.0076098662 NA  NA
Helpful.Friends          0.0030271957 NA  NA
Family.Respect           0.0035550006 NA  NA
Similar.Values           0.0009211578 NA  NA
Successful.Family        0.0018234059 NA  NA
Trust                    0.0025723455 NA  NA
Loyalty                  0.0018234059 NA  NA
Family.Pride             0.0132408410 NA  NA
Expression               0.0004485458 NA  NA
Spend.Time.Together      0.0021893411 NA  NA
Feel.Close               0.0082416201 NA  NA
Togetherness             0.0025116505 NA  NA
Religious.Attendance     0.0093179662 NA  NA
Religious.Importance     0.0125077768 NA  NA
Close.knit.Community     0.0047312659 NA  NA
Helpful.Community        0.0093620909 NA  NA
Community.Shares.Values  0.0141419161 NA  NA
Get.Along                0.0075057862 NA  NA
Community.Trust          0.0082957061 NA  NA
Health.Insurance         0.0032004397 NA  NA
Dental.Insurance         0.0019447044 NA  NA
Discrimination           0.0177739413 NA  NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

ggsave(filename="Folk_train_VIMP.png",width=5,height=5,units="in")

Test Set Importance

test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
328.0000000  51.0000000   6.4313725   0.1345646   0.5686275   0.5823171 
       prec         npv    misclass       brier  brier.norm         auc 
  0.1746988   0.8967136   0.4195251   0.1160561   0.4642243   0.5974414 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.2672811   0.3877765   0.1345646   0.1643881   0.4213563   0.4816040 
      gmean 
  0.5754316 
test_rf$importance
                                  all  0 Yes
Ethnicity                6.012949e-03 NA  NA
Age                      2.759423e-02 NA  NA
Gender                   2.741019e-04 NA  NA
Religion                -1.157462e-03 NA  NA
Employment               6.041262e-03 NA  NA
Income_median            2.603965e-03 NA  NA
EnglishSpeak             1.060551e-02 NA  NA
EnglishDiff              3.417156e-03 NA  NA
See.Family              -2.539399e-03 NA  NA
Close.Family            -7.752321e-04 NA  NA
Helpful.Family          -6.497141e-03 NA  NA
See.Friends             -1.467569e-04 NA  NA
Close.Friends            4.107855e-03 NA  NA
Helpful.Friends         -1.311790e-03 NA  NA
Family.Respect           3.383756e-05 NA  NA
Similar.Values           1.622703e-03 NA  NA
Successful.Family       -6.023257e-04 NA  NA
Trust                   -3.814687e-03 NA  NA
Loyalty                 -2.942095e-03 NA  NA
Family.Pride            -9.649487e-04 NA  NA
Expression              -8.642067e-04 NA  NA
Spend.Time.Together      2.216063e-03 NA  NA
Feel.Close               2.218099e-03 NA  NA
Togetherness            -7.018356e-04 NA  NA
Religious.Attendance     2.815081e-04 NA  NA
Religious.Importance    -2.434106e-03 NA  NA
Close.knit.Community    -2.458493e-04 NA  NA
Helpful.Community       -1.205382e-03 NA  NA
Community.Shares.Values -1.880313e-03 NA  NA
Get.Along               -1.524397e-03 NA  NA
Community.Trust         -1.668409e-03 NA  NA
Health.Insurance        -4.654616e-04 NA  NA
Dental.Insurance        -2.823929e-03 NA  NA
Discrimination          -1.283090e-04 NA  NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)|> 
  mutate(fill = case_when(variable=="Ethnicity"~"red",
                                                 .default="black"))

importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance,fill = fill))+
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw() + 
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
importance_plot

ggsave(filename="Folk_test_VIMP.png",width=5,height=5,units="in")